
* ---------------------------------------------------------------------------
* Purpose: split the corpus into one word per observation, give every distinct
* word a frequency rank, and compare
*   (a) the character length of a rank-coded version of the text with the
*       character length of the plain text, and
*   (b) the size of a zipped copy of the word list with its unzipped size.
* Inputs : <pwd>/corpus/eng-x-bible-kingjames.dta (must contain a string
*          variable named sentence).
* Outputs: rank.dta, intro.txt and intro.zip in the working directory, plus
*          the two ratios printed to the Results window.
* ---------------------------------------------------------------------------
local dir "`c(pwd)'"

* Forward slashes are accepted by Stata on every platform; backslashes only
* work on Windows and can interact badly with macro expansion.
use "`dir'/corpus/eng-x-bible-kingjames", clear

* Discard the first eight observations (presumably header lines -- confirm).
drop in 1/8

* One observation per word: replicate each sentence once per word, number the
* copies inside the sentence, then pick the matching word.
gen sentid = _n
gen sentencelength = wordcount(sentence)
expand sentencelength
* bysort already sorts on sentid, so no separate sort is needed beforehand.
bysort sentid: gen wordid = _n
gen word = word(sentence, wordid)
compress
keep word
drop if word == "" | word == " "

* Remember the running-text position of every token.
gen order = _n
* NOTE(review): the quoted literal below appears to hold invisible characters
* (presumably byte-order marks from the source text file -- confirm); it is
* kept exactly as in the original script.
drop if word=="﻿﻿"

* Build the frequency table (match = number of occurrences) and rank words
* from most to least frequent.
preserve
gen match = 1
collapse (sum) match, by(word)
* The secondary key breaks frequency ties alphabetically so that the ranks
* written to rank.dta are identical on every run.
gsort -match word
gen rank = _n
save rank, replace
restore

* Attach rank and frequency to every token. Each word must be found in the
* table because the table was built from these same tokens.
merge m:1 word using rank, assert(match)

* id==1 flags the first occurrence of each word in running-text order.
sort word order
by word: gen id = _n
sort order

* Rank-coded text: every token becomes its rank; the first occurrence of a
* word additionally carries "_word". A trailing blank acts as separator.
gen zipf = string(rank)
replace zipf = zipf + "_" + word if id == 1
replace zipf = zipf + " "
gen length_zipf = ustrlen(zipf)
sum length_zipf
ret list
local nom = r(sum)

* Plain text: every token plus one separating blank. zip_word keeps the
* unpadded word for the export below.
gen zip_word = word
replace word = word + " "
gen length_word = ustrlen(word)
sum length_word
ret list

* Ratio (a): rank-coded length relative to plain-text length.
di `nom'/r(sum)

* Ratio (b): zipped size relative to unzipped size of the exported word list.
* NOTE(review): outfile writes string variables into a fixed-width field, so
* intro.txt may contain padding blanks -- confirm this is intended for the
* size comparison.
outfile zip_word using "intro.txt", noquote replace
zipfile intro.txt, saving(intro.zip, replace)
checksum intro.txt
local unzipped = r(filelen)
checksum intro.zip
local zipped = r(filelen)
display `zipped'/`unzipped'


* Stop here; anything below this line in the do-file is not executed.
exit
* ---------------------------------------------------------------------------
* Scratch check (not executed when the do-file is run as a whole, because an
* earlier -exit- ends the run before this point). Run these lines manually.
* It flags every observation where v1 and the next five observations spell
* "ed tha", then lists the flagged observations beyond row 100000.
* Assumes the dataset holds one character per observation in v1 -- TODO
* confirm against the file that creates it.
* ---------------------------------------------------------------------------
local dir "`c(pwd)'"
* Forward slashes keep the path usable on every platform Stata runs on.
use "`dir'/stata_files/length_eng-x-bible-kingjames_40_original", clear
gen one = 1 if v1 == "e"        ///
    & v1[_n+1] == "d"           ///
    & v1[_n+2] == " "           ///
    & v1[_n+3] == "t"           ///
    & v1[_n+4] == "h"           ///
    & v1[_n+5] == "a"
list if one == 1 & _n > 100000
exit
*positions: 127348 they perceived that li==9 | 125150 they supposed that	
